Binome : - Wacim BELAHCEL - Imad Oualid KACIMI
#Partie A :
library(FactoMineR)
library(stats)
spam=read.table("https://www.math.univ-toulouse.fr/~besse/Wikistat/data/spam.dat",header=TRUE)
dim(spam)
[1] 4601 58
names(spam)
[1] "spam" "make" "address" "all" "X3d" "our" "over" "remove" "internet"
[10] "order" "mail" "receive" "will" "people" "report" "addresses" "free" "business"
[19] "email" "you" "credit" "your" "font" "X000" "money" "hp" "hpl"
[28] "george" "X650" "lab" "labs" "telnet" "X857" "data" "X415" "X85"
[37] "technology" "X1999" "parts" "pm" "direct" "cs" "meeting" "original" "project"
[46] "re" "edu" "table" "conference" "CsemiCol" "Cpar" "Ccroch" "Cexclam" "Cdollar"
[55] "Cdiese" "CapLM" "CapLsup" "CapLtot"
spam[,1]=as.factor(spam[,1])
spam
NA
les données sont trés asysemtrique avec des valeurs max tres eloigné du centre (skewed), la normalisation log rapproche les données leurs donne une forme un peu plus normal
Lspam=data.frame("spam"=spam[,1],log(1+spam[,2:58]))
Lspam
summary(spam)
spam make address all X3d our over
0:2788 Min. :0.0000 Min. : 0.000 Min. :0.0000 Min. : 0.00000 Min. : 0.0000 Min. :0.0000
1:1813 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.:0.0000
Median :0.0000 Median : 0.000 Median :0.0000 Median : 0.00000 Median : 0.0000 Median :0.0000
Mean :0.1046 Mean : 0.213 Mean :0.2807 Mean : 0.06542 Mean : 0.3122 Mean :0.0959
3rd Qu.:0.0000 3rd Qu.: 0.000 3rd Qu.:0.4200 3rd Qu.: 0.00000 3rd Qu.: 0.3800 3rd Qu.:0.0000
Max. :4.5400 Max. :14.280 Max. :5.1000 Max. :42.81000 Max. :10.0000 Max. :5.8800
remove internet order mail receive will people
Min. :0.0000 Min. : 0.0000 Min. :0.00000 Min. : 0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
Median :0.0000 Median : 0.0000 Median :0.00000 Median : 0.0000 Median :0.00000 Median :0.1000 Median :0.00000
Mean :0.1142 Mean : 0.1053 Mean :0.09007 Mean : 0.2394 Mean :0.05982 Mean :0.5417 Mean :0.09393
3rd Qu.:0.0000 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 0.1600 3rd Qu.:0.00000 3rd Qu.:0.8000 3rd Qu.:0.00000
Max. :7.2700 Max. :11.1100 Max. :5.26000 Max. :18.1800 Max. :2.61000 Max. :9.6700 Max. :5.55000
report addresses free business email you credit
Min. : 0.00000 Min. :0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. : 0.00000
1st Qu.: 0.00000 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.: 0.00000
Median : 0.00000 Median :0.0000 Median : 0.0000 Median :0.0000 Median :0.0000 Median : 1.310 Median : 0.00000
Mean : 0.05863 Mean :0.0492 Mean : 0.2488 Mean :0.1426 Mean :0.1847 Mean : 1.662 Mean : 0.08558
3rd Qu.: 0.00000 3rd Qu.:0.0000 3rd Qu.: 0.1000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 2.640 3rd Qu.: 0.00000
Max. :10.00000 Max. :4.4100 Max. :20.0000 Max. :7.1400 Max. :9.0900 Max. :18.750 Max. :18.18000
your font X000 money hp hpl george
Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. : 0.00000 Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000
Median : 0.2200 Median : 0.0000 Median :0.0000 Median : 0.00000 Median : 0.0000 Median : 0.0000 Median : 0.0000
Mean : 0.8098 Mean : 0.1212 Mean :0.1016 Mean : 0.09427 Mean : 0.5495 Mean : 0.2654 Mean : 0.7673
3rd Qu.: 1.2700 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
Max. :11.1100 Max. :17.1000 Max. :5.4500 Max. :12.50000 Max. :20.8300 Max. :16.6600 Max. :33.3300
X650 lab labs telnet X857 data X415
Min. :0.0000 Min. : 0.00000 Min. :0.0000 Min. : 0.00000 Min. :0.00000 Min. : 0.00000 Min. :0.00000
1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.:0.00000
Median :0.0000 Median : 0.00000 Median :0.0000 Median : 0.00000 Median :0.00000 Median : 0.00000 Median :0.00000
Mean :0.1248 Mean : 0.09892 Mean :0.1029 Mean : 0.06475 Mean :0.04705 Mean : 0.09723 Mean :0.04784
3rd Qu.:0.0000 3rd Qu.: 0.00000 3rd Qu.:0.0000 3rd Qu.: 0.00000 3rd Qu.:0.00000 3rd Qu.: 0.00000 3rd Qu.:0.00000
Max. :9.0900 Max. :14.28000 Max. :5.8800 Max. :12.50000 Max. :4.76000 Max. :18.18000 Max. :4.76000
X85 technology X1999 parts pm direct cs
Min. : 0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000 Min. : 0.00000 Min. :0.00000 Min. :0.00000
1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Median : 0.0000 Median :0.00000 Median :0.000 Median :0.0000 Median : 0.00000 Median :0.00000 Median :0.00000
Mean : 0.1054 Mean :0.09748 Mean :0.137 Mean :0.0132 Mean : 0.07863 Mean :0.06483 Mean :0.04367
3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.: 0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :20.0000 Max. :7.69000 Max. :6.890 Max. :8.3300 Max. :11.11000 Max. :4.76000 Max. :7.14000
meeting original project re edu table conference
Min. : 0.0000 Min. :0.0000 Min. : 0.0000 Min. : 0.0000 Min. : 0.0000 Min. :0.000000 Min. : 0.00000
1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.000000 1st Qu.: 0.00000
Median : 0.0000 Median :0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000 Median :0.000000 Median : 0.00000
Mean : 0.1323 Mean :0.0461 Mean : 0.0792 Mean : 0.3012 Mean : 0.1798 Mean :0.005444 Mean : 0.03187
3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.1100 3rd Qu.: 0.0000 3rd Qu.:0.000000 3rd Qu.: 0.00000
Max. :14.2800 Max. :3.5700 Max. :20.0000 Max. :21.4200 Max. :22.0500 Max. :2.170000 Max. :10.00000
CsemiCol Cpar Ccroch Cexclam Cdollar Cdiese CapLM
Min. :0.00000 Min. :0.000 Min. :0.00000 Min. : 0.0000 Min. :0.00000 Min. : 0.00000 Min. : 1.000
1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.: 1.588
Median :0.00000 Median :0.065 Median :0.00000 Median : 0.0000 Median :0.00000 Median : 0.00000 Median : 2.276
Mean :0.03857 Mean :0.139 Mean :0.01698 Mean : 0.2691 Mean :0.07581 Mean : 0.04424 Mean : 5.191
3rd Qu.:0.00000 3rd Qu.:0.188 3rd Qu.:0.00000 3rd Qu.: 0.3150 3rd Qu.:0.05200 3rd Qu.: 0.00000 3rd Qu.: 3.706
Max. :4.38500 Max. :9.752 Max. :4.08100 Max. :32.4780 Max. :6.00300 Max. :19.82900 Max. :1102.500
CapLsup CapLtot
Min. : 1.00 Min. : 1.0
1st Qu.: 6.00 1st Qu.: 35.0
Median : 15.00 Median : 95.0
Mean : 52.17 Mean : 283.3
3rd Qu.: 43.00 3rd Qu.: 266.0
Max. :9989.00 Max. :15841.0
library(Factoshiny)
le package 㤼㸱Factoshiny㤼㸲 a 攼㸹t攼㸹 compil攼㸹 avec la version R 4.0.3Le chargement a n攼㸹cessit攼㸹 le package : shiny
le package 㤼㸱shiny㤼㸲 a 攼㸹t攼㸹 compil攼㸹 avec la version R 4.0.3Le chargement a n攼㸹cessit攼㸹 le package : FactoInvestigate
le package 㤼㸱FactoInvestigate㤼㸲 a 攼㸹t攼㸹 compil攼㸹 avec la version R 4.0.3Le chargement a n攼㸹cessit攼㸹 le package : ggplot2
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Factoshiny(Lspam)
Listening on http://127.0.0.1:3242
NA
res.PCA<-PCA(Lspam,quali.sup=c(1),graph=FALSE)
plot.PCA(res.PCA,choix='var',select='contrib 57',unselect=0,title="Graphe des variables de l'ACP",col.quanti.sup='#0000FF')
plot.PCA(res.PCA,invisible=c('ind.sup'),select='contrib 402',habillage=1,title="Graphe des individus de l'ACP",label ='none')
les données ne sont pas trés bien représenté, une minorité d’indidividu participe fortement à la création des axes, le reste sont rassemblé autour de l’origine. (1754, 1489, 904…) les variable Caplsup et Capltot et Caplm participe fortement à la creation des axes (l’axe 1 => Capltot et l’axe 2 => CaplSup Caplm)
res.pca=PCA(spam,scale.unit = FALSE,quali.sup=1)
NA
NA
pas de gros changement dans la contribution des individu (il y’a toujours des individu qui contribue trop fortement à la creation des axex) cependant cela donne une meilleurs visualisation et cela met toutes les variable sur un meme scale ce qui atténue la contribution des variable CAP il y’a donc plus de variable qui conribue à la creationdes axes.
res.pca1=PCA(spam,scale.unit = TRUE,quali.sup=1)
la log transform atténue la contribution des individus cité precedement, cependant nous perdons le scaling de nos variable cité predecement et avons donc le méme probléme que dans la premiére représentation, c’est à dire que certaines variable (CAPS) contribue trop fortement à la création des axes.
res.pca=PCA(Lspam,scale.unit = FALSE, quali.sup=1)
on retrouve l’avantage qu’offre le scaling sur nos variable qui attenue la contribution des variable caps sur nos axes (bien que dans ce cas nous voyons une forte contribution sur l’axe 2), les individu paraisse un peu mieux séparé que sur le premier cas, avec une contribution attenué de nos outlier (on remarque particulierement que l’individu 1754 n’est pas celui qui contribue le plus à la creation de nos axes) nous remarqu’on quand méme une contribution plus forte de certains.
res.pca=PCA(Lspam,scale.unit = TRUE,quali.sup=1)
en utilisant la command si dessous, on peut voir qu’il y’a une forte correlation avec certains mots technique sur l’aXe 1 (X857, X415, telnet, labs X85…), on peut donc en déduire que l’axe 1 décrit la nature d’un email l’axe 2 quand a lui est est trés corrélé avec les variable CAPLsup, CapLtot, CapLM, nous pouvons donc en déduire que cette axe décrit la syntax utilisé dans l’écriture de l’email
barplot(res.pca$eig[,1],main="Eigenvalues",
names.arg=1:nrow(res.pca$eig))
plot(res.pca,choix="ind",habillage=1,
lcex=0.5,label ='none')
plot(res.pca,choix="var")
dimdesc(res.pca,axes=c(1,2))
$Dim.1
$quanti
correlation p.value
X857 0.79066959 0.000000e+00
X415 0.78555109 0.000000e+00
telnet 0.76204500 0.000000e+00
labs 0.70881830 0.000000e+00
X85 0.70817912 0.000000e+00
technology 0.70334461 0.000000e+00
X650 0.69203040 0.000000e+00
direct 0.65766301 0.000000e+00
hp 0.63683943 0.000000e+00
hpl 0.63439947 0.000000e+00
lab 0.62041777 0.000000e+00
Cpar 0.40465161 7.645503e-181
george 0.36616112 5.538570e-146
original 0.29556958 2.023646e-93
X1999 0.23615701 2.438071e-59
pm 0.21565173 1.502880e-49
re 0.13118689 4.082323e-19
Ccroch 0.11438785 7.113190e-15
meeting 0.10767117 2.429956e-13
project 0.09464677 1.253952e-10
data 0.07386856 5.274568e-07
cs 0.05790514 8.495480e-05
conference 0.05088044 5.553092e-04
edu 0.03734280 1.130321e-02
X3d -0.04051672 5.983979e-03
Cdiese -0.07361473 5.769921e-07
font -0.07693448 1.743019e-07
report -0.10236108 3.411893e-12
address -0.11093451 4.487672e-14
will -0.13979732 1.622304e-21
mail -0.15568246 2.354610e-26
email -0.15804739 4.034904e-27
addresses -0.17672310 1.360757e-33
people -0.18839781 5.040241e-38
internet -0.19739932 1.202802e-41
credit -0.20650277 1.708547e-45
make -0.20948203 8.567055e-47
our -0.21398447 8.515885e-49
over -0.23194822 2.994698e-57
business -0.24602028 2.121799e-64
order -0.24697247 6.697550e-65
all -0.24715035 5.396653e-65
remove -0.24983313 2.033867e-66
receive -0.25281179 5.095608e-68
CapLM -0.26408317 2.840356e-74
free -0.26707702 5.494297e-76
money -0.27122292 2.140120e-78
CapLtot -0.27141499 1.651094e-78
X000 -0.29022013 5.459779e-90
Cdollar -0.31640994 1.668202e-107
CapLsup -0.32317975 2.490732e-112
Cexclam -0.32787261 9.488781e-116
you -0.34445476 2.526735e-128
your -0.37699966 2.499372e-155
$quali
R2 p.value
spam 0.267884 9.074311e-314
$category
Estimate p.value
spam=spam_0 1.462728 9.074311e-314
spam=spam_1 -1.462728 9.074311e-314
attr(,"class")
[1] "condes" "list "
$Dim.2
$quanti
correlation p.value
CapLsup 0.72523309 0.000000e+00
CapLtot 0.66730276 0.000000e+00
CapLM 0.60324096 0.000000e+00
your 0.46858877 6.522763e-250
direct 0.45688345 3.911215e-236
Cdollar 0.43078524 2.734456e-207
order 0.41853399 1.341555e-194
X415 0.40736184 1.773855e-183
X857 0.40578506 6.089812e-182
X000 0.40553726 1.059739e-181
telnet 0.37822157 2.096976e-156
mail 0.37147888 1.595909e-150
business 0.35158337 5.696451e-134
receive 0.35104075 1.551073e-133
addresses 0.34629333 9.124239e-130
money 0.33182171 1.128983e-118
Cexclam 0.32929741 8.454184e-117
technology 0.31891269 2.835938e-109
X85 0.31644043 1.587795e-107
all 0.31618659 2.395175e-107
labs 0.30330520 1.632053e-98
email 0.30315246 2.064557e-98
credit 0.30112058 4.645968e-97
you 0.29223178 2.855161e-91
over 0.28874873 4.655178e-89
X650 0.28749436 2.864330e-88
make 0.27524520 8.950824e-81
internet 0.27450548 2.467960e-80
remove 0.27210950 6.451112e-79
our 0.27143250 1.612505e-78
free 0.24633045 1.458186e-64
lab 0.23116891 7.221923e-57
will 0.22868177 1.172869e-55
people 0.21905956 4.143574e-51
Cpar 0.20815687 3.261804e-46
hpl 0.16009187 8.588623e-28
report 0.15429484 6.544370e-26
Cdiese 0.14240422 2.835708e-22
hp 0.14107988 6.906375e-22
address 0.10212039 3.833967e-12
font 0.06464657 1.141773e-05
original 0.06442883 1.222098e-05
X3d 0.04731766 1.324976e-03
X1999 -0.03675396 1.265936e-02
parts -0.04376914 2.982784e-03
conference -0.07868051 9.102437e-08
project -0.10617939 5.172766e-13
data -0.11432719 7.350659e-15
cs -0.11730308 1.438002e-15
meeting -0.13402722 6.855976e-20
re -0.16137425 3.220366e-28
george -0.17890761 2.124936e-34
edu -0.20286557 6.198031e-44
$quali
R2 p.value
spam 0.2905246 0
$category
Estimate p.value
spam=spam_1 1.251124 0
spam=spam_0 -1.251124 0
attr(,"class")
[1] "condes" "list "
$call
$call$num.var
[1] 1
$call$proba
[1] 0.05
$call$weights
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[63] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[125] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[187] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[249] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[311] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[373] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[435] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[497] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[559] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[621] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[683] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[745] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[807] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[869] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[931] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[993] 1 1 1 1 1 1 1 1
[ reached getOption("max.print") -- omitted 3601 entries ]
$call$X
NANA
le critére de ward est beaucoup mieux adapté pour des classes qui ne sont pas allongé et bien séparé nous remarquons dans ce cas qu’il arrive à séparé nos variable en 2 à 3 catégorie distincts (via la methode du coude) nous retrouvons effectivemnt des lements d’interpretation de notre ACP precedente, les variable les plus corrélé à l’axe 1 se retrouve dans un cluster séparé, et celle corrélé à l’axe 2 dans un deuxieme cluster. Aussi on peut remarquer que la distance utilisée est basée sur la correlation entre les variables, sachant que pour le nuage de variable obtenue en applicant une pca, sur un tableau reduit centré la distance entre 2 variables est de la meme formule.
dist.var<-as.dist(1-cor(Lspam[2:58])**2)
clas.var<-hclust(dist.var,method="ward.D2")
plot(clas.var)
plot(clas.var$height[56:40])
NA
NA
La rerresentation est trés simmilaire à celle de l’acp car nos variable sont centrées reduites, et la distance utilisée est semblable à la distance obtenue entre 2 variables en appliquant une acp sur un nuage de variable centré réduit
rS = cor(Lspam[2:58])
dS2=sqrt(1-rS**2)
dN=dimnames(Lspam[2:58])[[2]]
mdspam= cmdscale(dS2, k=2)
plot(mdspam, type="n", xlab="", ylab="",main="")
text(mdspam,dN)
abline(v=0,h=0)
mdspam
[,1] [,2]
make 0.08709778 -0.040659286
address 0.07207711 -0.074915575
all 0.09116118 -0.008683126
X3d 0.07156140 -0.075261502
our 0.08637958 -0.043617773
over 0.08679503 -0.030061747
remove 0.08530228 -0.029721544
internet 0.08574460 -0.040152259
order 0.11254518 0.119206449
mail 0.09828903 0.028397962
receive 0.09590606 -0.013229860
will 0.08214607 -0.043280639
people 0.08066172 -0.051402615
report 0.07765710 -0.047442618
addresses 0.09764812 0.004542300
free 0.07955979 -0.048218962
business 0.09610362 -0.008377008
email 0.08827012 -0.044059115
you 0.08288234 -0.070796294
credit 0.09103239 -0.008661246
your 0.11284430 0.021281190
font 0.08397321 -0.069314458
X000 0.10608838 0.028053434
money 0.09262995 -0.018187184
hp -0.17750399 -0.065377534
hpl -0.18518433 -0.063332063
george 0.04058744 0.055237137
X650 -0.31284140 -0.000312009
lab -0.25525046 0.021990185
labs -0.34920332 0.019231755
telnet -0.46143873 0.063880617
X857 -0.52925021 0.094289943
data 0.07397199 -0.087513230
X415 -0.52508818 0.093494670
X85 -0.35437941 0.016066219
technology -0.35683971 0.030200559
X1999 0.06445009 -0.121538123
parts 0.07095749 -0.085320359
pm 0.05928039 -0.105277299
direct -0.41476343 0.073941018
cs 0.07739942 -0.107030737
meeting 0.06836616 -0.068621491
original 0.03168246 -0.110080398
project 0.07165949 -0.074179638
re 0.07731757 -0.054036375
edu 0.07938736 -0.098509009
table 0.07029244 -0.084491798
conference 0.07129614 -0.085678164
CsemiCol 0.07708366 -0.088858753
Cpar -0.06220126 -0.046319657
Ccroch 0.06877929 -0.093920917
Cexclam 0.08878751 0.008618409
Cdollar 0.11032784 0.073330375
Cdiese 0.08276062 -0.055731580
CapLM 0.16870907 0.449103504
CapLsup 0.20325817 0.571092849
CapLtot 0.18323348 0.490213373
sur le plot des variable de l’acp on peut voir qu’il y’a 4 cluster de variable, par exemple les points bleus sont les plus corrélés avec l’axe 2 et ceux qui contribue le plus à sa création.
classes <- cutree(clas.var,k=4)
sort(classes)
make address all X3d our over remove internet
1 1 1 1 1 1 1 1
order mail receive will people report addresses free
1 1 1 1 1 1 1 1
business email you credit your X000 money george
1 1 1 1 1 1 1 1
data X1999 parts pm cs meeting original project
1 1 1 1 1 1 1 1
re edu table conference Cpar Ccroch Cexclam Cdollar
1 1 1 1 1 1 1 1
font CsemiCol Cdiese hp hpl X650 lab labs
2 2 2 3 3 3 3 3
telnet X857 X415 X85 technology direct CapLM CapLsup
3 3 3 3 3 3 4 4
CapLtot
4
names(classes[classes==2]) #variables de la classe 2
[1] "font" "CsemiCol" "Cdiese"
coul = classes
plot(mdspam, type="n", xlab="Dimension 1",
ylab="Dimension 2", main="CAH euclid")
text(mdspam,dN,col=coul)
Lecture du fichier
spam.quali <- read.table("https://www.math.univ-toulouse.fr/~besse/Wikistat/data/spamq.dat")
spam.quali
NA
NA
La discrimination lineaire ne parrait pas etre une approche rentable, bien que l’on voit bien qu’il exite deux cluster disctint il y a un chevauchement important entre les 2 classes.
afc=MCA(spam.quali,quali.sup=c(32,34,58))
plot.MCA(afc,invisible=c("ind"),col.var="blue")
# avec un zoom
plot.MCA(afc,invisible=c("ind"),col.var="blue",
xlim=c(-1,1),ylim=c(-1,1))
# les messages en couleur
plot(afc$ind$coord,type="p",pch=".",cex=2,col=as.factor(spam.quali[,58]),xlim=c(-1,1),ylim=c(-1,1))
NA
NA
NA
Les classe semble simmilaire entre le hclust et le kmeans donc on peut dire que les classes sont stables
dist.mod=dist(afc$var$coord, method="euclidean")
hclusmod=hclust(dist.mod,method="ward.D2")
plot(hclusmod)
plot(hclusmod$height[112:100])
hclasmod = cutree(hclusmod,k=4)
clas.mod=kmeans(afc$var$coord, 4)
kclasmod=clas.mod$cluster
# comparaison des classes entre CAH et k-means
table(hclasmod,kclasmod)
kclasmod
hclasmod 1 2 3 4
1 0 0 16 15
2 0 58 0 0
3 10 0 0 0
4 9 2 3 0
les modalitées qui carracterisent la class spam sont ceux les plus proche du centre spam (en bleu) les modalitées indifférentiables sont les modalitées qui sont à une distance plus au moin proche des 2 modalitées
plot.MCA(afc,invisible=c("ind"),
col.var=as.integer(clas.mod$cluster))
plot(afc$ind$coord,type="p",pch=".",cex=2,
col=as.factor(spam.quali[,58]))
NA
NA
NA
NA
library(NMF)
creux=as.matrix(spam[,1:57])
creux=data.frame(matrix(as.numeric(as.matrix(spam[,1:57])),ncol=57))
classe=spam[,58]
creux=cbind(log(1+creux[,1:54]),log(1+creux[,55:57])/2)
boxplot(creux)
# souci pour la suite :
sum(apply(creux,1,sum)==0)
[1] 0
# 3 messages sont devenus tout à 0
# suppression
ident=apply(creux,1,sum)!=0
creux=creux[ident,]
classe=classe[ident]
nmf.spam=nmf(creux,5,method="snmf/l",nrun=30,seed=111)
summary(nmf.spam)
rank sparseness.basis sparseness.coef silhouette.coef
5.0000000 0.4096215 0.6735297 0.7010137
silhouette.basis residuals niter cpu
0.5780159 5339.3415865 250.0000000 NA
cpu.all nrun cophenetic dispersion
NA 30.0000000 0.9959198 0.9009829
silhouette.consensus
0.9263145
s=featureScore(nmf.spam)
summary(s)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.019 0.135 0.232 0.243 0.319 0.688 4232
s=extractFeatures(nmf.spam)
str(s)
List of 5
$ : Named int NA
..- attr(*, "names")= chr NA
$ : Named int NA
..- attr(*, "names")= chr NA
$ : Named int NA
..- attr(*, "names")= chr NA
$ : Named int NA
..- attr(*, "names")= chr NA
$ : Named int NA
..- attr(*, "names")= chr NA
- attr(*, "method")= chr "kim"
# les matrices de facteurs
w=basis(nmf.spam)
h=coef(nmf.spam)
basismap(nmf.spam,annRow=classe,hclustfun="ward")
The "ward" method has been renamed to "ward.D"; note new "ward.D2"
coefmap(nmf.spam,hclustfun="ward")
The "ward" method has been renamed to "ward.D"; note new "ward.D2"
dist.mod=dist(t(h), method="euclidean")
hclusmod.h=hclust(dist.mod,method="ward.D2")
plot(hclusmod)
plot(hclusmod$height[56:46])
mdspam= cmdscale(dist.mod, k=2)
dN=dimnames(h)[[2]]
plot(mdspam, type="n", xlab="", ylab="",main="")
text(mdspam,dN)
abline(v=0,h=0)
dist.mod=dist(scale(t(h)), method="eucl")
mdspam= cmdscale(dist.mod, k=2)
hclusmod.h=hclust(dist.mod,method="ward.D2")
plot(hclusmod.h)
plot(hclusmod.h$height[56:46])
hclasmod = cutree(hclusmod.h,k=4)
plot(mdspam, type="n", xlab="", ylab="",main="")
text(mdspam,dN,col=hclasmod)
abline(v=0,h=0)
NA
NA
#classificaiton des messages à partir de w
dist.mod=dist(scale(w), method="euclidean")
hclusmod.w=hclust(dist.mod,method="ward.D2")
plot(hclusmod.w)
# intégration des deux classifications
aheatmap(creux,Rowv=hclusmod.w,
Colv=hclusmod.h,annRow=classe,
annCol=as.factor(hclasmod))